import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
# Raise the pandas display limits so wide/long frames are shown without truncation
# up to 500 columns and 5000 rows.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
def dame_variables_categoricas(dataset=None):
    """Split the non-float columns of ``dataset`` into categorical and other.

    Categorical columns are picked from a fixed, hand-written list of names
    rather than inferred from dtype or cardinality, because many numeric
    columns in this dataset are 0/1 flags. The function is therefore tied to
    this dataset and is not reusable as-is elsewhere.

    Float columns are skipped entirely: they appear in neither returned list.

    Args:
        dataset: pandas DataFrame holding the data.

    Returns:
        A tuple ``(lista_variables_categoricas, other)``:
        ``lista_variables_categoricas`` holds the non-float columns whose name
        is in the hand-picked list, ``other`` holds the remaining non-float
        columns; both keep the DataFrame's column order.
        When ``dataset`` is None a message is printed and the integer ``1``
        is returned instead of a tuple, so callers that unpack the result
        must pass a DataFrame.
    """
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1

    # Hand-picked categorical columns (see docstring for the rationale).
    nombres_categoricos = {
        "fraud_bool",
        "payment_type",
        "employment_status",
        "housing_status",
        "source",
        "device_os",
    }

    lista_variables_categoricas = []
    other = []
    for columna in dataset.columns:
        # Explicit float test instead of comparing the dtype with the Python
        # builtins ``float``/``int``: what ``int`` maps to depends on the
        # platform and NumPy version, so ``dtype != int`` gives different
        # answers for int64 columns on different machines.
        if pd.api.types.is_float_dtype(dataset[columna].dtype):
            continue
        if columna in nombres_categoricos:
            lista_variables_categoricas.append(columna)
        else:
            other.append(columna)
    return lista_variables_categoricas, other
# Load the raw dataset; the path is relative to the current working directory.
df_base = pd.read_csv("../data/Base.csv")
# Bare expression: displays the frame when run as a notebook cell.
df_base
| fraud_bool | income | name_email_similarity | prev_address_months_count | current_address_months_count | customer_age | days_since_request | intended_balcon_amount | payment_type | zip_count_4w | velocity_6h | velocity_24h | velocity_4w | bank_branch_count_8w | date_of_birth_distinct_emails_4w | employment_status | credit_risk_score | email_is_free | housing_status | phone_home_valid | phone_mobile_valid | bank_months_count | has_other_cards | proposed_credit_limit | foreign_request | source | session_length_in_minutes | device_os | keep_alive_session | device_distinct_emails_8w | device_fraud_count | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.9 | 0.166828 | -1 | 88 | 50 | 0.020925 | -1.331345 | AA | 769 | 10650.765523 | 3134.319630 | 3863.647740 | 1 | 6 | CA | 185 | 0 | BA | 1 | 0 | 24 | 0 | 500.0 | 0 | INTERNET | 3.888115 | windows | 0 | 1 | 0 | 7 |
| 1 | 1 | 0.9 | 0.296286 | -1 | 144 | 50 | 0.005418 | -0.816224 | AB | 366 | 534.047319 | 2670.918292 | 3124.298166 | 718 | 3 | CA | 259 | 1 | BA | 0 | 0 | 15 | 0 | 1500.0 | 0 | INTERNET | 31.798819 | windows | 0 | 1 | 0 | 7 |
| 2 | 1 | 0.9 | 0.044985 | -1 | 132 | 40 | 3.108549 | -0.755728 | AC | 870 | 4048.534263 | 2893.621498 | 3159.590679 | 1 | 14 | CB | 177 | 1 | BA | 0 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 4.728705 | other | 0 | 1 | 0 | 7 |
| 3 | 1 | 0.9 | 0.159511 | -1 | 22 | 50 | 0.019079 | -1.205124 | AB | 810 | 3457.064063 | 4054.908412 | 3022.261812 | 1921 | 6 | CA | 110 | 1 | BA | 0 | 1 | 31 | 1 | 200.0 | 0 | INTERNET | 2.047904 | linux | 0 | 1 | 0 | 7 |
| 4 | 1 | 0.9 | 0.596414 | -1 | 218 | 50 | 0.004441 | -0.773276 | AB | 890 | 5020.341679 | 2728.237159 | 3087.670952 | 1990 | 2 | CA | 295 | 1 | BA | 1 | 0 | 31 | 0 | 1500.0 | 0 | INTERNET | 3.775225 | macintosh | 1 | 1 | 0 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 999995 | 0 | 0.6 | 0.192631 | -1 | 104 | 40 | 0.030592 | -1.044454 | AB | 804 | 7905.711839 | 8341.468557 | 4972.635997 | 1 | 8 | CA | 75 | 1 | BC | 1 | 1 | 25 | 0 | 200.0 | 0 | INTERNET | 8.511502 | linux | 1 | 1 | 0 | 4 |
| 999996 | 0 | 0.8 | 0.322989 | 148 | 9 | 50 | 1.628119 | -1.409803 | AC | 3306 | 5391.470463 | 4955.170808 | 5022.728108 | 0 | 2 | CC | 154 | 1 | BC | 1 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 8.967865 | windows | 0 | 1 | 0 | 4 |
| 999997 | 0 | 0.8 | 0.879403 | -1 | 30 | 20 | 0.018563 | 34.692760 | AA | 1522 | 8063.102636 | 5670.654316 | 4377.196321 | 2023 | 6 | CF | 64 | 0 | BC | 0 | 1 | 11 | 0 | 200.0 | 0 | INTERNET | 8.195531 | other | 0 | 1 | 0 | 4 |
| 999998 | 0 | 0.9 | 0.762112 | -1 | 189 | 20 | 0.015352 | 94.661055 | AA | 1418 | 8092.641762 | 3982.582204 | 4394.803296 | 1678 | 6 | CA | 163 | 0 | BA | 1 | 0 | 28 | 0 | 500.0 | 0 | INTERNET | 4.336064 | windows | 1 | 1 | 0 | 4 |
| 999999 | 0 | 0.2 | 0.697452 | -1 | 321 | 20 | 2.655916 | 9.908499 | AA | 951 | 6169.630036 | 3695.308261 | 4352.334543 | 2 | 12 | CA | 36 | 1 | BE | 0 | 1 | 15 | 0 | 200.0 | 0 | INTERNET | 6.717022 | linux | 0 | 1 | 0 | 4 |
1000000 rows × 32 columns
# Work on an independent copy so the replacements below do not touch the
# object returned by read_csv.
df_base = df_base.copy()

# Columns in which the value -1 is replaced by NaN (presumably -1 is the
# dataset's "missing" sentinel — confirm against the dataset documentation).
# Names must match the DataFrame's columns exactly: a misspelled name raises
# KeyError here instead of being skipped silently.
columnas_con_centinela = [
    "prev_address_months_count",
    "current_address_months_count",
    "bank_months_count",
    "session_length_in_minutes",
    "device_distinct_emails_8w",
]
for columna in columnas_con_centinela:
    # np.where with np.nan yields a float array, so integer columns become float64.
    df_base[columna] = np.where(df_base[columna] == -1, np.nan, df_base[columna])

# For this column every negative value (not only -1) is treated as missing.
df_base["intended_balcon_amount"] = np.where(
    df_base["intended_balcon_amount"] < 0,
    np.nan,
    df_base["intended_balcon_amount"],
)
Este código realiza una copia del DataFrame original y luego modifica ciertas columnas específicas: reemplaza los valores -1 por NaN en algunas columnas y los valores negativos por NaN en la columna "intended_balcon_amount". De esta manera podemos trabajar de forma más óptima en el EDA, sabiendo con precisión cuáles son los valores nulos.
Dimensión
# Compare the shape before and after dropping duplicate rows; identical row
# counts mean there are no fully duplicated rows.
print(df_base.shape, df_base.drop_duplicates().shape)
(1000000, 32) (1000000, 32)
Tipos de datos
# Column name -> dtype mapping, as a plain dict.
df_base.dtypes.to_dict()
{'fraud_bool': dtype('int64'),
'income': dtype('float64'),
'name_email_similarity': dtype('float64'),
'prev_address_months_count': dtype('float64'),
'current_address_months_count': dtype('float64'),
'customer_age': dtype('int64'),
'days_since_request': dtype('float64'),
'intended_balcon_amount': dtype('float64'),
'payment_type': dtype('O'),
'zip_count_4w': dtype('int64'),
'velocity_6h': dtype('float64'),
'velocity_24h': dtype('float64'),
'velocity_4w': dtype('float64'),
'bank_branch_count_8w': dtype('int64'),
'date_of_birth_distinct_emails_4w': dtype('int64'),
'employment_status': dtype('O'),
'credit_risk_score': dtype('int64'),
'email_is_free': dtype('int64'),
'housing_status': dtype('O'),
'phone_home_valid': dtype('int64'),
'phone_mobile_valid': dtype('int64'),
'bank_months_count': dtype('float64'),
'has_other_cards': dtype('int64'),
'proposed_credit_limit': dtype('float64'),
'foreign_request': dtype('int64'),
'source': dtype('O'),
'session_length_in_minutes': dtype('float64'),
'device_os': dtype('O'),
'keep_alive_session': dtype('int64'),
'device_distinct_emails_8w': dtype('int64'),
'device_fraud_count': dtype('int64'),
'month': dtype('int64')}
# Class balance of the target: share of each 'fraud_bool' value as a percentage.
df_base_fraud_status = (
    df_base['fraud_bool']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

# Absolute frequency of each 'fraud_bool' value.
df_base_fraud_status_conteo = df_base['fraud_bool'].value_counts().reset_index()

# Join both tables on the target value so each row carries percent and count.
# NOTE(review): joining on 'fraud_bool' relies on reset_index() naming the
# value column after the Series, which depends on the pandas version — confirm.
df_base_fraud_status_pc = df_base_fraud_status.merge(
    df_base_fraud_status_conteo,
    how='inner',
    on=['fraud_bool'],
)
df_base_fraud_status_pc
| fraud_bool | percent | count | |
|---|---|---|---|
| 0 | 0 | 98.8971 | 988971 |
| 1 | 1 | 1.1029 | 11029 |
Este código proporciona una visión detallada de la distribución de la columna 'fraud_bool', incluyendo el porcentaje y la frecuencia absoluta de cada valor.
# Plot the 'percent' column of the class-balance table against 'fraud_bool'.
# NOTE(review): the table is already aggregated (one row per class), so a bar
# chart may express the intent more directly than a histogram — confirm.
fig = px.histogram(df_base_fraud_status_pc, x="fraud_bool", y=['percent'])
fig.show()
Cada barra del histograma representa un valor único en 'fraud_bool', y la altura de la barra indica el porcentaje de ocurrencia de ese valor en el conjunto de datos.
# Missing-value bookkeeping, per column and per row, both sorted from most to
# fewest nulls. The null mask is computed once and reused for both axes.
mascara_nulos = df_base.isnull()
df_series_null_columns = mascara_nulos.sum().sort_values(ascending=False)
df_series_null_rows = mascara_nulos.sum(axis=1).sort_values(ascending=False)
print(df_series_null_columns.shape, df_series_null_rows.shape)

n_filas, n_columnas = df_base.shape

# Per column: null count and the fraction (0-1) of rows that are null.
df_null_columnas = pd.DataFrame(df_series_null_columns, columns=['nulos_columnas'])
df_null_columnas['porcentaje_columnas'] = df_null_columnas['nulos_columnas'] / n_filas

# Per row: null count, the row's target value (aligned on the index) and the
# fraction (0-1) of columns that are null.
df_null_filas = pd.DataFrame(df_series_null_rows, columns=['nulos_filas'])
df_null_filas['target'] = df_base['fraud_bool'].copy()
df_null_filas['porcentaje_filas'] = df_null_filas['nulos_filas'] / n_columnas
(32,) (1000000,)
Este código realiza un análisis detallado de los valores nulos en el DataFrame df_base, proporcionando información sobre la cantidad y el porcentaje de valores nulos en cada columna y fila, así como su relación con la variable objetivo 'fraud_bool'.
# (rows, columns) of the working frame.
df_base.shape
(1000000, 32)
# Display the per-column null counts and fractions.
df_null_columnas
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| intended_balcon_amount | 742523 | 0.742523 |
| prev_address_months_count | 712920 | 0.712920 |
| bank_months_count | 253635 | 0.253635 |
| current_address_months_count | 4254 | 0.004254 |
| session_length_in_minutes | 2015 | 0.002015 |
| fraud_bool | 0 | 0.000000 |
| foreign_request | 0 | 0.000000 |
| phone_mobile_valid | 0 | 0.000000 |
| has_other_cards | 0 | 0.000000 |
| proposed_credit_limit | 0 | 0.000000 |
| device_os | 0 | 0.000000 |
| source | 0 | 0.000000 |
| housing_status | 0 | 0.000000 |
| keep_alive_session | 0 | 0.000000 |
| device_distinct_emails_8w | 0 | 0.000000 |
| device_fraud_count | 0 | 0.000000 |
| phone_home_valid | 0 | 0.000000 |
| credit_risk_score | 0 | 0.000000 |
| email_is_free | 0 | 0.000000 |
| income | 0 | 0.000000 |
| employment_status | 0 | 0.000000 |
| date_of_birth_distinct_emails_4w | 0 | 0.000000 |
| bank_branch_count_8w | 0 | 0.000000 |
| velocity_4w | 0 | 0.000000 |
| velocity_24h | 0 | 0.000000 |
| velocity_6h | 0 | 0.000000 |
| zip_count_4w | 0 | 0.000000 |
| payment_type | 0 | 0.000000 |
| days_since_request | 0 | 0.000000 |
| customer_age | 0 | 0.000000 |
| name_email_similarity | 0 | 0.000000 |
| month | 0 | 0.000000 |
# Keep only the columns whose fraction of nulls is strictly below the threshold.
threshold = 0.9
columnas_bajo_umbral = df_null_columnas['porcentaje_columnas'] < threshold
list_vars_not_null = df_null_columnas.loc[columnas_bajo_umbral].index.tolist()

# Column order follows df_null_columnas (most nulls first), not df_base.
df_base_filter_null = df_base.loc[:, list_vars_not_null]
df_base_filter_null.shape
(1000000, 32)
Filtra las columnas del DataFrame original df_base y crea un nuevo DataFrame (df_base_filter_null) que retiene solo aquellas columnas que tienen un porcentaje de valores nulos inferior al umbral especificado (90%).
# Display the per-row null counts, target value and null fraction.
df_null_filas
| nulos_filas | target | porcentaje_filas | |
|---|---|---|---|
| 526010 | 4 | 0 | 0.125 |
| 771103 | 4 | 0 | 0.125 |
| 948524 | 4 | 0 | 0.125 |
| 979619 | 4 | 0 | 0.125 |
| 553607 | 4 | 0 | 0.125 |
| ... | ... | ... | ... |
| 200377 | 0 | 0 | 0.000 |
| 568248 | 0 | 0 | 0.000 |
| 916268 | 0 | 0 | 0.000 |
| 916266 | 0 | 0 | 0.000 |
| 500000 | 0 | 0 | 0.000 |
1000000 rows × 3 columns
# Split the non-float columns into the hand-picked categorical ones and the rest.
list_cat_vars, other = dame_variables_categoricas(dataset=df_base_filter_null)

# Cast each categorical column to the pandas "category" dtype.
for columna in list_cat_vars:
    df_base_filter_null[columna] = df_base_filter_null[columna].astype("category")

df_base_filter_null[list_cat_vars].head()
| fraud_bool | device_os | source | housing_status | employment_status | payment_type | |
|---|---|---|---|---|---|---|
| 0 | 1 | windows | INTERNET | BA | CA | AA |
| 1 | 1 | windows | INTERNET | BA | CA | AB |
| 2 | 1 | other | INTERNET | BA | CB | AC |
| 3 | 1 | linux | INTERNET | BA | CA | AB |
| 4 | 1 | macintosh | INTERNET | BA | CA | AB |
Identifica las variables categóricas en df_base_filter_null, las convierte al tipo de dato "category" y luego imprime una vista previa de esas columnas.
# Display the names of the columns treated as categorical.
list_cat_vars
['fraud_bool', 'device_os', 'source', 'housing_status', 'employment_status', 'payment_type']
# Absolute frequency of each value of 'month'.
df_base_filter_null['month'].value_counts()
month 3 150936 2 136979 0 132440 4 127691 1 127620 5 119323 6 108168 7 96843 Name: count, dtype: int64
# Absolute frequency of each value of 'bank_months_count' (value_counts drops NaN by default).
df_base_filter_null['bank_months_count'].value_counts()
bank_months_count 1.0 194802 28.0 80082 15.0 59141 30.0 50777 31.0 46084 25.0 40450 10.0 37158 20.0 30850 21.0 29098 5.0 28001 2.0 25836 26.0 24779 11.0 23745 6.0 17678 29.0 11696 3.0 8580 19.0 8212 22.0 6741 9.0 4801 16.0 4563 27.0 4197 4.0 3834 24.0 1822 12.0 1121 7.0 931 14.0 493 18.0 491 23.0 320 32.0 46 8.0 30 17.0 4 13.0 2 Name: count, dtype: int64
# Check the dtype of the categorical columns after the cast.
df_base_filter_null[list_cat_vars].dtypes
fraud_bool category device_os category source category housing_status category employment_status category payment_type category dtype: object
# First 10 rows of the non-float columns that were not classified as categorical.
df_base_filter_null[other].head(10)
| foreign_request | phone_mobile_valid | has_other_cards | keep_alive_session | device_distinct_emails_8w | device_fraud_count | phone_home_valid | credit_risk_score | email_is_free | date_of_birth_distinct_emails_4w | bank_branch_count_8w | zip_count_4w | customer_age | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 185 | 0 | 6 | 1 | 769 | 50 | 7 |
| 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 259 | 1 | 3 | 718 | 366 | 50 | 7 |
| 2 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 177 | 1 | 14 | 1 | 870 | 40 | 7 |
| 3 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 110 | 1 | 6 | 1921 | 810 | 50 | 7 |
| 4 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 295 | 1 | 2 | 1990 | 890 | 50 | 7 |
| 5 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 199 | 1 | 13 | 5 | 732 | 30 | 7 |
| 6 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 272 | 1 | 10 | 13 | 876 | 30 | 7 |
| 7 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 83 | 1 | 1 | 40 | 901 | 50 | 7 |
| 8 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 222 | 0 | 4 | 2134 | 933 | 40 | 7 |
| 9 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 118 | 1 | 2 | 8 | 1176 | 40 | 7 |
direccion_actual = df_base_filter_null["current_address_months_count"]

# Number of distinct non-null values.
print(direccion_actual.value_counts().count())

# Relative frequency of each value after converting it to a lower-cased,
# stripped string; NaN becomes the string 'nan' and is therefore counted too.
direccion_actual.apply(lambda valor: str(valor).lower().strip()).value_counts(normalize=True)
422
current_address_months_count 6.0 0.016376 7.0 0.016270 8.0 0.016089 5.0 0.015896 9.0 0.015688 4.0 0.015030 10.0 0.014739 11.0 0.013988 3.0 0.013986 12.0 0.013063 2.0 0.012741 13.0 0.011802 1.0 0.011297 14.0 0.010946 15.0 0.009699 0.0 0.009609 38.0 0.009021 40.0 0.008924 41.0 0.008835 37.0 0.008831 33.0 0.008811 34.0 0.008807 36.0 0.008795 39.0 0.008776 35.0 0.008767 42.0 0.008704 16.0 0.008615 43.0 0.008436 32.0 0.008424 31.0 0.008188 44.0 0.008165 45.0 0.008041 46.0 0.007994 30.0 0.007857 17.0 0.007634 29.0 0.007436 47.0 0.007429 49.0 0.007233 48.0 0.007198 28.0 0.007161 18.0 0.007074 27.0 0.006861 50.0 0.006818 26.0 0.006560 51.0 0.006408 52.0 0.006342 19.0 0.006337 25.0 0.006331 53.0 0.006180 20.0 0.006116 23.0 0.005948 24.0 0.005934 54.0 0.005930 21.0 0.005755 22.0 0.005747 55.0 0.005625 56.0 0.005599 57.0 0.005315 58.0 0.005218 59.0 0.004995 60.0 0.004867 61.0 0.004716 63.0 0.004604 62.0 0.004570 64.0 0.004515 65.0 0.004453 67.0 0.004391 69.0 0.004295 66.0 0.004269 nan 0.004254 68.0 0.004151 71.0 0.004145 70.0 0.004079 72.0 0.004052 73.0 0.004042 74.0 0.003920 75.0 0.003852 76.0 0.003698 77.0 0.003644 78.0 0.003588 79.0 0.003470 80.0 0.003385 81.0 0.003163 82.0 0.003146 83.0 0.003081 84.0 0.002967 85.0 0.002883 86.0 0.002824 188.0 0.002766 187.0 0.002740 87.0 0.002720 189.0 0.002700 88.0 0.002698 89.0 0.002671 186.0 0.002629 115.0 0.002593 191.0 0.002588 185.0 0.002572 184.0 0.002557 113.0 0.002548 190.0 0.002540 192.0 0.002527 90.0 0.002515 112.0 0.002499 91.0 0.002499 118.0 0.002488 111.0 0.002485 194.0 0.002483 122.0 0.002468 110.0 0.002455 107.0 0.002451 114.0 0.002449 193.0 0.002447 183.0 0.002441 117.0 0.002440 124.0 0.002437 116.0 0.002418 120.0 0.002416 101.0 0.002413 92.0 0.002409 105.0 0.002409 94.0 0.002401 108.0 0.002400 123.0 0.002388 106.0 0.002388 93.0 0.002383 99.0 0.002380 95.0 0.002378 109.0 0.002377 195.0 0.002373 119.0 0.002366 182.0 0.002365 98.0 0.002364 104.0 0.002349 125.0 0.002332 102.0 0.002330 128.0 0.002328 127.0 0.002317 96.0 
0.002311 121.0 0.002308 103.0 0.002295 100.0 0.002259 126.0 0.002255 97.0 0.002254 129.0 0.002217 181.0 0.002185 130.0 0.002183 132.0 0.002178 135.0 0.002175 196.0 0.002143 136.0 0.002111 180.0 0.002098 131.0 0.002078 133.0 0.002062 134.0 0.002058 138.0 0.002054 137.0 0.002046 179.0 0.002038 197.0 0.002024 139.0 0.002008 142.0 0.001987 141.0 0.001976 178.0 0.001973 140.0 0.001952 143.0 0.001945 198.0 0.001928 144.0 0.001899 145.0 0.001890 147.0 0.001890 177.0 0.001873 148.0 0.001857 149.0 0.001850 146.0 0.001825 151.0 0.001824 153.0 0.001806 152.0 0.001800 155.0 0.001783 150.0 0.001774 156.0 0.001738 199.0 0.001712 154.0 0.001708 176.0 0.001703 157.0 0.001703 160.0 0.001688 175.0 0.001660 159.0 0.001649 158.0 0.001624 164.0 0.001593 161.0 0.001592 200.0 0.001560 162.0 0.001540 163.0 0.001532 174.0 0.001523 173.0 0.001498 165.0 0.001494 166.0 0.001471 170.0 0.001463 167.0 0.001456 171.0 0.001454 201.0 0.001446 172.0 0.001444 169.0 0.001411 168.0 0.001391 202.0 0.001306 203.0 0.001250 204.0 0.001093 238.0 0.001052 235.0 0.001015 236.0 0.001013 237.0 0.000988 233.0 0.000987 241.0 0.000985 242.0 0.000982 232.0 0.000978 231.0 0.000977 240.0 0.000965 230.0 0.000963 234.0 0.000963 247.0 0.000962 228.0 0.000961 229.0 0.000953 246.0 0.000953 239.0 0.000945 226.0 0.000944 227.0 0.000940 243.0 0.000938 370.0 0.000930 205.0 0.000920 244.0 0.000914 245.0 0.000903 367.0 0.000903 224.0 0.000897 368.0 0.000874 369.0 0.000871 251.0 0.000871 225.0 0.000871 222.0 0.000860 248.0 0.000853 250.0 0.000849 371.0 0.000841 365.0 0.000840 223.0 0.000838 254.0 0.000835 373.0 0.000826 253.0 0.000826 219.0 0.000823 252.0 0.000819 207.0 0.000815 249.0 0.000815 206.0 0.000812 221.0 0.000805 364.0 0.000805 366.0 0.000804 372.0 0.000795 220.0 0.000789 255.0 0.000785 256.0 0.000785 374.0 0.000773 217.0 0.000767 363.0 0.000758 208.0 0.000753 258.0 0.000738 257.0 0.000736 375.0 0.000734 262.0 0.000729 215.0 0.000724 218.0 0.000708 209.0 0.000705 210.0 0.000698 361.0 0.000698 362.0 0.000692 259.0 
0.000687 376.0 0.000682 216.0 0.000676 377.0 0.000674 261.0 0.000673 260.0 0.000672 213.0 0.000667 211.0 0.000666 212.0 0.000665 264.0 0.000626 214.0 0.000617 263.0 0.000617 360.0 0.000613 378.0 0.000612 265.0 0.000610 267.0 0.000598 268.0 0.000589 266.0 0.000587 359.0 0.000569 269.0 0.000559 358.0 0.000535 270.0 0.000532 278.0 0.000518 271.0 0.000515 299.0 0.000515 272.0 0.000513 275.0 0.000510 379.0 0.000506 302.0 0.000500 284.0 0.000498 277.0 0.000494 274.0 0.000494 273.0 0.000493 295.0 0.000492 296.0 0.000492 294.0 0.000488 380.0 0.000487 288.0 0.000483 297.0 0.000479 291.0 0.000475 276.0 0.000474 289.0 0.000474 301.0 0.000474 310.0 0.000469 309.0 0.000465 304.0 0.000465 290.0 0.000465 298.0 0.000464 307.0 0.000464 286.0 0.000462 311.0 0.000456 303.0 0.000454 280.0 0.000453 356.0 0.000453 287.0 0.000452 357.0 0.000452 312.0 0.000446 282.0 0.000446 306.0 0.000444 281.0 0.000444 300.0 0.000442 279.0 0.000436 308.0 0.000435 292.0 0.000435 293.0 0.000429 305.0 0.000428 315.0 0.000424 313.0 0.000422 321.0 0.000422 381.0 0.000419 283.0 0.000417 285.0 0.000417 317.0 0.000406 316.0 0.000401 319.0 0.000401 355.0 0.000398 314.0 0.000392 325.0 0.000391 318.0 0.000391 322.0 0.000377 324.0 0.000376 320.0 0.000374 327.0 0.000363 323.0 0.000360 326.0 0.000351 382.0 0.000343 354.0 0.000332 328.0 0.000331 383.0 0.000331 332.0 0.000328 331.0 0.000324 329.0 0.000322 352.0 0.000308 333.0 0.000290 335.0 0.000287 338.0 0.000287 353.0 0.000287 330.0 0.000286 334.0 0.000284 336.0 0.000278 384.0 0.000277 337.0 0.000267 343.0 0.000254 339.0 0.000254 385.0 0.000248 340.0 0.000243 349.0 0.000242 351.0 0.000234 344.0 0.000229 350.0 0.000229 348.0 0.000226 346.0 0.000223 341.0 0.000221 345.0 0.000215 347.0 0.000209 342.0 0.000204 386.0 0.000160 387.0 0.000131 388.0 0.000115 389.0 0.000110 390.0 0.000064 391.0 0.000048 392.0 0.000038 393.0 0.000021 394.0 0.000017 395.0 0.000015 396.0 0.000010 398.0 0.000008 397.0 0.000008 399.0 0.000007 410.0 0.000007 402.0 0.000005 407.0 0.000005 401.0 
0.000005 408.0 0.000004 406.0 0.000004 400.0 0.000004 404.0 0.000004 412.0 0.000004 405.0 0.000003 425.0 0.000002 418.0 0.000002 411.0 0.000002 416.0 0.000002 417.0 0.000001 424.0 0.000001 419.0 0.000001 403.0 0.000001 428.0 0.000001 413.0 0.000001 414.0 0.000001 409.0 0.000001 Name: proportion, dtype: float64
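Cuenta los valores distintos de la columna "current_address_months_count" y muestra la frecuencia relativa (proporción) de cada valor; la columna en sí no se modifica.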
# Clean floating-point noise in "income" (e.g. 0.6000000000000001 -> 0.6,
# 0.7000000000000001 -> 0.7) by rounding to one decimal place.
# The whole column is assigned back in a single step: chained indexing
# (df[col][mask] = value) may write to a temporary copy and triggers
# SettingWithCopyWarning.
# NOTE(review): assumes every income value is meant to have one decimal place — confirm.
df_base_filter_null["income"] = df_base_filter_null["income"].round(1)
C:\Users\jaime\AppData\Local\Temp\ipykernel_9648\1868583841.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\jaime\AppData\Local\Temp\ipykernel_9648\1868583841.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
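Redondeamos los valores de "income" afectados por errores de coma flotante (0.6000000000000001 → 0.6 y 0.7000000000000001 → 0.7).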
# Cast every int64 column to float64. The dtypes are read once up front;
# casting one column does not change the dtype of any other.
for columna, tipo in df_base_filter_null.dtypes.items():
    if tipo == "int64":
        df_base_filter_null[columna] = df_base_filter_null[columna].astype("float64")
Convierte todas las columnas tipo int64 en float64 para poder trabajar correctamente con ellas en la parte de preprocesamiento de datos.
# Persist the preprocessed frame for the next stage.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed first column — confirm downstream readers expect it.
df_base_filter_null.to_csv("../data/pd_data_initial_preprocessing.csv")